{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3L.h5\n",
    "# ftp://ngs.sanger.ac.uk/production/ag1000g/phase1/AR3/variation/crosses/ar3/hdf5/ag1000g.crosses.phase1.ar3sites.3R.h5\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "import gzip\n",
    "import random\n",
    "\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import pandas as pd\n",
    "\n",
    "from sklearn import preprocessing \n",
    "\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "mendelian_errors = pickle.load(gzip.open('mendelian_errors.pickle.gz', 'rb'))\n",
    "feature_fit = np.load(gzip.open('feature_fit.npy.gz', 'rb'))\n",
    "ordered_features = np.load(open('ordered_features', 'rb'))\n",
    "num_features = len(ordered_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10905732 10364044 541688 4.9670026734564905\n"
     ]
    }
   ],
   "source": [
    "total_observations = len(mendelian_errors)\n",
    "error_observations = len(list(filter(lambda x: x[0] > 0,mendelian_errors.values())))\n",
    "ok_observations = total_observations - error_observations\n",
    "fraction_errors = error_observations/total_observations\n",
    "print (total_observations, ok_observations, error_observations, 100*fraction_errors)\n",
    "del mendelian_errors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = preprocessing.StandardScaler(copy=False)\n",
    "scaler.fit(feature_fit[:,:-1])\n",
    "scaler.transform(feature_fit[:,:-1])\n",
    "#Original feature fit is now lost\n",
    "np.save(gzip.open('scaled_fit.npy.gz', 'wb'), feature_fit, allow_pickle=False, fix_imports=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "151.27928857724643"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "feature_fit[:,9].max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_fit = np.load(gzip.open('feature_fit.npy.gz', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "#total_observations*fraction_errors\n",
    "prob_ok_choice = error_observations / ok_observations\n",
    "\n",
    "def accept_entry(row):\n",
    "    if row[-1] == 1:\n",
    "        return True\n",
    "    return random.random() <= prob_ok_choice\n",
    "\n",
    "accept_entry_v = np.vectorize(accept_entry, signature='(i)->()')\n",
    "\n",
    "accepted_entries = accept_entry_v(feature_fit)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(541688, 542199)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "balanced_fit = feature_fit[accepted_entries]\n",
    "del feature_fit\n",
    "balanced_fit.shape\n",
    "len([x for x in balanced_fit if x[-1] == 1]), len([x for x in balanced_fit if x[-1] == 0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(gzip.open('balanced_fit.npy.gz', 'wb'), balanced_fit, allow_pickle=False, fix_imports=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[23297.447150856133,\n",
       " 353.6483397254511,\n",
       " 2203.156293967914,\n",
       " 26.443352529058448,\n",
       " 0.7400051850423522,\n",
       " 6.428653959422727,\n",
       " 48.26200237908218,\n",
       " 27.39395435132998,\n",
       " 6.140998517407471,\n",
       " 6909.07052502565,\n",
       " -0.21665604902955368,\n",
       " 0.499764274320109]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[balanced_fit[:, column].mean() for column in range(num_features + 1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "#scaler = preprocessing.StandardScaler(copy=False) #Do not do this\n",
    "#scaler.fit(balanced_fit[:,:-1])  # especially this!!!!\n",
    "scaler.transform(balanced_fit[:,:-1])\n",
    "#Original balanced fit is now lost\n",
    "scaled_balanced_fit = balanced_fit # Better naming"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[-0.11924974054618583,\n",
       " 0.05537257269568515,\n",
       " -0.1261142725981237,\n",
       " 0.4984897988431588,\n",
       " 0.07034404738278016,\n",
       " 0.2103768017770095,\n",
       " -0.19192075229384115,\n",
       " 0.053775154282361515,\n",
       " 0.9047553788755753,\n",
       " 0.6011860427175225,\n",
       " -0.10151509888827716,\n",
       " 0.499764274320109]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[scaled_balanced_fit[:, column].mean() for column in range(num_features + 1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(gzip.open('scaled_balanced_fit.npy.gz', 'wb'), scaled_balanced_fit, allow_pickle=False, fix_imports=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(1083887, 12)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "scaled_balanced_fit.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "cv_indexes = np.random.random(scaled_balanced_fit.shape[0]) > 0.8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaled_balanced_fit_cv = balanced_fit[cv_indexes]\n",
    "scaled_balanced_fit_100_test = balanced_fit[~cv_indexes]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(gzip.open('scaled_balanced_fit_cv.npy.gz', 'wb'), scaled_balanced_fit_cv, allow_pickle=False, fix_imports=False)\n",
    "np.save(gzip.open('scaled_balanced_fit_100_test.npy.gz', 'wb'), scaled_balanced_fit_100_test, allow_pickle=False, fix_imports=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.save(gzip.open('scaled_balanced_fit_10_test.npy.gz', 'wb'), scaled_balanced_fit_100_test[\n",
    "    np.random.random(scaled_balanced_fit_100_test.shape[0]) <= 0.1], allow_pickle=False, fix_imports=False)\n",
    "np.save(gzip.open('scaled_balanced_fit_1_test.npy.gz', 'wb'), scaled_balanced_fit_100_test[\n",
    "    np.random.random(scaled_balanced_fit_100_test.shape[0]) <= 0.01], allow_pickle=False, fix_imports=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
